
#include <machine/asm.h>

/*
 * STORE8 / STORE16 / STORE32 store 8, 16 or 32 bytes of fill data at ip and
 * advance ip past the bytes just written.  The instruction sequence used is
 * chosen at build time from the feature macros below.
 */
#if defined(NEON)
/* NEON: fill data comes from d0-d3; ":64" is a 64-bit alignment hint on ip. */
#define	STORE8		vst1.32		{d0}, [ip:64]!
#define	STORE16		vst1.32		{d0-d1}, [ip:64]!
#define	STORE32		vst1.32		{d0-d3}, [ip:64]!
#elif defined(VFP)
/* VFP: store-multiple of d0-d3 with writeback to ip. */
#define	STORE8		vstmia		ip!, {d0}
#define	STORE16		vstmia		ip!, {d0-d1}
#define	STORE32		vstmia		ip!, {d0-d3}
#elif defined(_ARM_ARCH_DWORD_OK)
/*
 * Doubleword store of the r2/r3 register pair, post-indexed by 8.
 * The larger sizes simply repeat the 8-byte store.
 */
#define	STORE8		strd		r2, [ip], #8
#define	STORE16		STORE8; STORE8
#define	STORE32		STORE16; STORE16
#else
/*
 * Fallback: store-multiple of r2 and r3 with writeback to ip.
 * The larger sizes simply repeat the 8-byte store.
 */
#define	STORE8		stmia		ip!, {r2,r3}
#define	STORE16		STORE8; STORE8
#define	STORE32		STORE16; STORE16
#endif
/*
 * memset: Sets a block of memory to the specified value.
 *
 * All bulk stores go through the STORE8/STORE16/STORE32 macros, so this
 * one body serves the NEON, VFP, strd and stmia variants.
 *
 * On entry:
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 *
 * On exit:
 *   r0 - dest address
 *
 * Register usage inside the function:
 *   ip - current write pointer
 *   r1 - bytes remaining
 *   r3 - fill byte replicated into all four byte lanes; never modified
 *        once it has been set up
 *   r2 - scratch; holds a copy of the fill pattern only where the integer
 *        STOREn variants need the r2/r3 pair
 *   d0-d3 - fill pattern (NEON and VFP variants only)
 *
 * Note: the remaining length is tested with signed conditions throughout,
 * so lengths of 2GB or more are treated as negative and are not handled.
 */
/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
	ands		r3, r1, #0xff	/* We deal with bytes */
	orrne		r3, r3, r3, lsl #8	/* replicate to all bytes */
	orrne		r3, r3, r3, lsl #16	/* replicate to all bytes */
	movs		r1, r2		/* we need r2 & r3 */
	RETc(eq)			/* return if length is 0 */
	mov		ip, r0		/* r0 needs to stay the same */

	/*
	 * Short fills must take the byte loop: the alignment code below
	 * may store up to 7 bytes without checking the length.
	 */
	cmp		r1, #12		/* is this a small memset? */
	blt		.Lbyte_by_byte	/*   then do it byte by byte */

	/* Ok first we will dword align the address */
	ands		r2, ip, #7	/* grab the bottom three bits */
	beq		.Lmemset_dwordaligned	/* The addr is dword aligned */

	rsb		r2, r2, #8	/* how far until dword aligned? */
	sub		r1, r1, r2	/* subtract it from remaining length */
	mov		r2, r3		/* duplicate fill value */

	tst		ip, #1		/* halfword aligned? */
	strneb		r3, [ip], #1	/*   no, write a byte */
	tst		ip, #2		/* word aligned? */
	strneh		r3, [ip], #2	/*   no, write a halfword */
	tst		ip, #4		/* dword aligned? */
	strne		r3, [ip], #4	/*   no, write a word */

	/* We are now doubleword aligned */
.Lmemset_dwordaligned:
#if defined(NEON)
	vdup.8		q0, r3		/* move fill to SIMD */
	vmov		q1, q0		/* put fill in q1 (d2-d3) */
#elif defined(VFP)
	mov		r2, r3		/* duplicate fill value */
	vmov		d0, r2, r3	/* move to VFP */
	vmov		d1, r2, r3
	vmov		d2, r2, r3
	vmov		d3, r2, r3
#endif

#if 1
	cmp		r1, #128
	blt		.Lmemset_mainloop
	ands		r2, ip, #63	/* check for 64-byte alignment */
	beq		.Lmemset_mainloop
	/*
	 * Let's align to a 64-byte boundary so that stores don't cross
	 * cacheline boundaries.  We also know we have at least 128-bytes to
	 * copy so we don't have to worry about the length at the moment.
	 */
	rsb		r2, r2, #64	/* how many bytes until 64 bytes */
	sub		r1, r1, r2	/* subtract from remaining length */
#if !defined(NEON) && !defined(VFP)
	mov		r2, r3		/* put fill back in r2 */
#endif

	/*
	 * Each store below must be skipped when the corresponding address
	 * bit is already clear; the length above was reduced by exactly
	 * the distance to the next 64-byte boundary.
	 */
	tst		ip, #8		/* quadword aligned? */
	beq		1f		/*   yes */
	STORE8				/*   no, store a dword */
1:	tst		ip, #16		/* octaword aligned? */
	beq		2f		/*   yes */
	STORE16				/*   no, store a quadword */
2:	tst		ip, #32		/* 64-byte aligned? */
	beq		.Lmemset_mainloop		/*   yes */
	STORE32				/*   no, make 64-byte aligned */
#endif

.Lmemset_mainloop:
#if !defined(NEON) && !defined(VFP)
	mov		r2, r3		/* put fill back in r2 */
#endif
	subs		r1, r1, #64	/* subtract an initial 64 */
	blt		.Lmemset_lessthan_64bytes

	/* The stores leave the flags from the preceding subs intact. */
3:	STORE32				/* store first octaword */
	STORE32				/* store second octaword */
	RETc(eq)			/* return if done */
	subs		r1, r1, #64	/* subtract another 64 */
	bge		3b		/* and do other if still >= 0 */
.Lmemset_lessthan_64bytes:
	/*
	 * r1 is (remaining - 64) here and therefore negative.  Its low six
	 * bits equal the number of bytes still to write, so keep only those;
	 * otherwise the "return if length is 0" checks could never succeed.
	 */
	and		r1, r1, #63	/* r1 = bytes remaining (0..63) */
	tst		r1, #32		/* do we have 32 bytes left? */
	beq		.Lmemset_lessthan_32bytes
	STORE32				/*    yes, store an octaword */
	bics		r1, r1, #32	/* subtract 32 */
	RETc(eq)			/* return if length is 0 */
.Lmemset_lessthan_32bytes:
	tst		r1, #16		/* do we have 16 bytes left? */
	beq		.Lmemset_lessthan_16bytes
	STORE16				/*   yes, store a quadword */
	bics		r1, r1, #16	/* subtract 16 */
	RETc(eq)			/* return if length is 0 */
.Lmemset_lessthan_16bytes:
	tst		r1, #8		/* do we have 8 bytes left? */
	beq		.Lmemset_lessthan_8bytes/*   no */
	STORE8				/*   yes, store a dword */
	bics		r1, r1, #8	/* subtract 8 */
	RETc(eq)			/* return if length is 0 */
.Lmemset_lessthan_8bytes:
	/*
	 * Use r3 for the tail: in the NEON and VFP builds r2 is only scratch
	 * by now and need not contain the fill pattern.
	 */
	tst		r1, #4		/* do we have a word left? */
	strne		r3, [ip], #4	/*   yes, so write one */
	tst		r1, #2		/* do we have a halfword left? */
	strneh		r3, [ip], #2	/*   yes, so write one */
	tst		r1, #1		/* do we have a byte left? */
	strneb		r3, [ip], #1	/*   yes, so write one */
	RET				/* return */

.Lbyte_by_byte:
	subs		r1, r1, #1	/* can we write a byte? */
	RETc(lt)			/*   no, we're done */
	strb		r3, [ip], #1	/*   yes, so do it */
	b		.Lbyte_by_byte	/* try next byte */
END(memset)
